# We load the libraries that will be used.
library(caTools)
library(corrplot)
## corrplot 0.92 loaded
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(foreign)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
library(leaflet)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(viridis)
## Loading required package: viridisLite
# We load our data set.
Houses <- read.arff("house_sales_reduced.arff")
Houses[1:9,]
## attribute_0 id price bedrooms bathrooms sqft_living sqft_lot floors
## 1 0 7129300520 221900 3 1.00 1180 5650 1
## 2 1 6414100192 538000 3 2.25 2570 7242 2
## 3 2 5631500400 180000 2 1.00 770 10000 1
## 4 3 2487200875 604000 4 3.00 1960 5000 1
## 5 4 1954400510 510000 3 2.00 1680 8080 1
## 6 5 7237550310 1225000 4 4.50 5420 101930 1
## 7 6 1321400060 257500 3 2.25 1715 6819 2
## 8 7 2008000270 291850 3 1.50 1060 9711 1
## 9 8 2414600126 229500 3 1.00 1780 7470 1
## waterfront view condition grade sqft_above sqft_basement yr_built
## 1 0 0 3 7 1180 0 1955
## 2 0 0 3 7 2170 400 1951
## 3 0 0 3 6 770 0 1933
## 4 0 0 5 7 1050 910 1965
## 5 0 0 3 8 1680 0 1987
## 6 0 0 3 11 3890 1530 2001
## 7 0 0 3 7 1715 0 1995
## 8 0 0 3 7 1060 0 1963
## 9 0 0 3 7 1050 730 1960
## yr_renovated zipcode lat long sqft_living15 sqft_lot15
## 1 0 98178 47.5112 -122.257 1340 5650
## 2 1991 98125 47.7210 -122.319 1690 7639
## 3 0 98028 47.7379 -122.233 2720 8062
## 4 0 98136 47.5208 -122.393 1360 5000
## 5 0 98074 47.6168 -122.045 1800 7503
## 6 0 98053 47.6561 -122.005 4760 101930
## 7 0 98003 47.3097 -122.327 2238 6819
## 8 0 98198 47.4095 -122.315 1650 9711
## 9 0 98146 47.5123 -122.337 1780 8113
# Atributes names.
names(Houses)
## [1] "attribute_0" "id" "price" "bedrooms"
## [5] "bathrooms" "sqft_living" "sqft_lot" "floors"
## [9] "waterfront" "view" "condition" "grade"
## [13] "sqft_above" "sqft_basement" "yr_built" "yr_renovated"
## [17] "zipcode" "lat" "long" "sqft_living15"
## [21] "sqft_lot15"
summary(Houses)
## attribute_0 id price bedrooms
## Min. : 0 Min. :1.000e+06 Min. : 75000 Min. : 0.000
## 1st Qu.: 5403 1st Qu.:2.123e+09 1st Qu.: 321950 1st Qu.: 3.000
## Median :10806 Median :3.905e+09 Median : 450000 Median : 3.000
## Mean :10806 Mean :4.580e+09 Mean : 540088 Mean : 3.371
## 3rd Qu.:16209 3rd Qu.:7.309e+09 3rd Qu.: 645000 3rd Qu.: 4.000
## Max. :21612 Max. :9.900e+09 Max. :7700000 Max. :33.000
##
## bathrooms sqft_living sqft_lot floors
## Min. :0.000 Min. : 290 Min. : 520 Min. :1.000
## 1st Qu.:1.750 1st Qu.: 1427 1st Qu.: 5040 1st Qu.:1.000
## Median :2.250 Median : 1910 Median : 7618 Median :1.500
## Mean :2.115 Mean : 2080 Mean : 15107 Mean :1.494
## 3rd Qu.:2.500 3rd Qu.: 2550 3rd Qu.: 10688 3rd Qu.:2.000
## Max. :8.000 Max. :13540 Max. :1651359 Max. :3.500
##
## waterfront view condition grade
## Min. :0.000000 Min. :0.0000 Min. :1.000 Min. : 1.000
## 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.: 7.000
## Median :0.000000 Median :0.0000 Median :3.000 Median : 7.000
## Mean :0.007542 Mean :0.2343 Mean :3.409 Mean : 7.657
## 3rd Qu.:0.000000 3rd Qu.:0.0000 3rd Qu.:4.000 3rd Qu.: 8.000
## Max. :1.000000 Max. :4.0000 Max. :5.000 Max. :13.000
##
## sqft_above sqft_basement yr_built yr_renovated
## Min. : 290 Min. : 0.0 Min. :1900 Min. : 0.0
## 1st Qu.:1190 1st Qu.: 0.0 1st Qu.:1951 1st Qu.: 0.0
## Median :1560 Median : 0.0 Median :1975 Median : 0.0
## Mean :1788 Mean : 291.5 Mean :1971 Mean : 84.4
## 3rd Qu.:2210 3rd Qu.: 560.0 3rd Qu.:1997 3rd Qu.: 0.0
## Max. :9410 Max. :4820.0 Max. :2015 Max. :2015.0
##
## zipcode lat long sqft_living15
## Min. :98001 Min. :47.16 Min. :-122.5 Min. : 399
## 1st Qu.:98033 1st Qu.:47.47 1st Qu.:-122.3 1st Qu.:1490
## Median :98065 Median :47.57 Median :-122.2 Median :1840
## Mean :98078 Mean :47.56 Mean :-122.2 Mean :1987
## 3rd Qu.:98118 3rd Qu.:47.68 3rd Qu.:-122.1 3rd Qu.:2360
## Max. :98199 Max. :47.78 Max. :-121.3 Max. :6210
##
## sqft_lot15
## 5000 : 427
## 4000 : 357
## 6000 : 289
## 7200 : 211
## 4800 : 145
## 7500 : 142
## (Other):20042
The following command allow us to see the data types of all the variables.
str(Houses)
## 'data.frame': 21613 obs. of 21 variables:
## $ attribute_0 : num 0 1 2 3 4 5 6 7 8 9 ...
## $ id : num 7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
## $ price : num 221900 538000 180000 604000 510000 ...
## $ bedrooms : num 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : num 1180 2570 770 1960 1680 ...
## $ sqft_lot : num 5650 7242 10000 5000 8080 ...
## $ floors : num 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : num 0 0 0 0 0 0 0 0 0 0 ...
## $ view : num 0 0 0 0 0 0 0 0 0 0 ...
## $ condition : num 3 3 3 5 3 3 3 3 3 3 ...
## $ grade : num 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : num 1180 2170 770 1050 1680 ...
## $ sqft_basement: num 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : num 1955 1951 1933 1965 1987 ...
## $ yr_renovated : num 0 1991 0 0 0 ...
## $ zipcode : num 98178 98125 98028 98136 98074 ...
## $ lat : num 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num -122 -122 -122 -122 -122 ...
## $ sqft_living15: num 1340 1690 2720 1360 1800 ...
## $ sqft_lot15 : Factor w/ 8689 levels "10000","10001",..: 5533 6933 7277 5064 6813 143 6321 8506 7326 6873 ...
# Converting 'sqft_lot15' to numerical since it has too many levels.
Houses$sqft_lot15 <- as.numeric(Houses$sqft_lot15)
# We convert the square feet into square meters using the fact that 1 ft^2 = 0.09290304 m^2 .
Houses[,"sqft_living"] <- Houses[,"sqft_living"]*0.09290304
Houses[,"sqft_lot"] <- Houses[,"sqft_lot"]*0.09290304
Houses[,"sqft_above"] <- Houses[,"sqft_above"]*0.09290304
Houses[,"sqft_basement"] <- Houses[,"sqft_basement"]*0.09290304
Houses[,"sqft_living15"] <- Houses[,"sqft_living15"]*0.09290304
Houses[,"sqft_lot15"] <- Houses[,"sqft_lot15"]*0.09290304
# We rename the columns to fit the new data.
names(Houses)[6] = "sqm_living"
names(Houses)[7] = "sqm_lot"
names(Houses)[13] = "sqm_above"
names(Houses)[14] = "sqm_basement"
names(Houses)[20] = "sqm_living15"
names(Houses)[21] = "sqm_lot15"
Houses[1:9,]
## attribute_0 id price bedrooms bathrooms sqm_living sqm_lot floors
## 1 0 7129300520 221900 3 1.00 109.62559 524.9022 1
## 2 1 6414100192 538000 3 2.25 238.76081 672.8038 2
## 3 2 5631500400 180000 2 1.00 71.53534 929.0304 1
## 4 3 2487200875 604000 4 3.00 182.08996 464.5152 1
## 5 4 1954400510 510000 3 2.00 156.07711 750.6566 1
## 6 5 7237550310 1225000 4 4.50 503.53448 9469.6069 1
## 7 6 1321400060 257500 3 2.25 159.32871 633.5058 2
## 8 7 2008000270 291850 3 1.50 98.47722 902.1814 1
## 9 8 2414600126 229500 3 1.00 165.36741 693.9857 1
## waterfront view condition grade sqm_above sqm_basement yr_built yr_renovated
## 1 0 0 3 7 109.62559 0.00000 1955 0
## 2 0 0 3 7 201.59960 37.16122 1951 1991
## 3 0 0 3 6 71.53534 0.00000 1933 0
## 4 0 0 5 7 97.54819 84.54177 1965 0
## 5 0 0 3 8 156.07711 0.00000 1987 0
## 6 0 0 3 11 361.39283 142.14165 2001 0
## 7 0 0 3 7 159.32871 0.00000 1995 0
## 8 0 0 3 7 98.47722 0.00000 1963 0
## 9 0 0 3 7 97.54819 67.81922 1960 0
## zipcode lat long sqm_living15 sqm_lot15
## 1 98178 47.5112 -122.257 124.4901 514.03252
## 2 98125 47.7210 -122.319 157.0061 644.09678
## 3 98028 47.7379 -122.233 252.6963 676.05542
## 4 98136 47.5208 -122.393 126.3481 470.46099
## 5 98074 47.6168 -122.045 167.2255 632.94841
## 6 98053 47.6561 -122.005 442.2185 13.28513
## 7 98003 47.3097 -122.327 207.9170 587.24012
## 8 98198 47.4095 -122.315 153.2900 790.23326
## 9 98146 47.5123 -122.337 165.3674 680.60767
# Seleccionar només les variables que volguem
Houses_for_pca <- subset(Houses, select = - c(id, attribute_0))
(houses_pca <- prcomp(Houses_for_pca, scale = TRUE))
## Standard deviations (1, .., p=19):
## [1] 2.377625e+00 1.546918e+00 1.244706e+00 1.147317e+00 1.039631e+00
## [6] 1.010273e+00 9.495284e-01 9.349664e-01 9.111071e-01 8.097547e-01
## [11] 7.584549e-01 7.159767e-01 6.478718e-01 5.729237e-01 5.260024e-01
## [16] 4.865578e-01 4.396953e-01 4.273636e-01 5.447382e-15
##
## Rotation (n x k) = (19 x 19):
## PC1 PC2 PC3 PC4 PC5
## price 0.30202058 0.298714782 -0.062212848 0.06000036 -0.136785565
## bedrooms 0.23533956 0.073943300 0.189362063 -0.35354908 0.188577222
## bathrooms 0.35110915 -0.002150944 -0.043928175 -0.13514676 0.120946417
## sqm_living 0.38814146 0.120537670 0.093271118 -0.11588189 0.016982002
## sqm_lot 0.08014006 -0.063374400 0.258215404 0.28661828 -0.407975745
## floors 0.22309796 -0.192523280 -0.443013596 0.03179641 0.079355713
## waterfront 0.06077447 0.216014438 -0.007122435 0.57701574 0.284898070
## view 0.12886277 0.326489057 0.046662605 0.43637029 0.195368204
## condition -0.07043308 0.225965294 0.389845196 -0.12415168 -0.165847577
## grade 0.36289983 0.029757279 -0.113054440 -0.01685475 -0.051579213
## sqm_above 0.37677486 -0.076563224 -0.047912778 -0.01468944 0.008950074
## sqm_basement 0.10050529 0.393397783 0.283206544 -0.21299546 0.018495144
## yr_built 0.21790947 -0.387271025 -0.132456427 0.01911433 0.146108565
## yr_renovated 0.01316716 0.180586468 -0.063268118 0.18231485 0.097771897
## zipcode -0.13397813 0.328118588 -0.410637939 -0.10887227 0.045462923
## lat 0.02390311 0.242691441 -0.346109652 -0.20530022 -0.462442091
## long 0.15403740 -0.365791949 0.301728258 0.14659501 -0.186010491
## sqm_living15 0.34553607 0.026336592 0.093280020 0.01162947 -0.073067123
## sqm_lot15 -0.07037990 -0.014521050 0.165866754 -0.25034212 0.566905104
## PC6 PC7 PC8 PC9 PC10
## price 0.009695952 -0.21367051 0.1037508173 -0.029122700 0.077076006
## bedrooms -0.090844512 0.17374377 0.0027827083 0.131792452 -0.488036449
## bathrooms -0.003599118 0.19470451 -0.0185767715 0.050324195 -0.149648307
## sqm_living -0.042637934 0.01636435 -0.0686796186 0.021429639 0.046620494
## sqm_lot -0.221778135 -0.05761469 -0.6901011758 0.220987518 -0.214265401
## floors 0.008034394 0.02418795 0.0677668951 0.346466779 -0.157871023
## waterfront 0.205618720 -0.10668336 0.1309115246 -0.070884097 -0.535694472
## view 0.184235038 0.01280516 -0.1026331737 -0.053622654 0.273963553
## condition 0.224567178 -0.19206214 0.3578302903 0.554092140 -0.051134976
## grade 0.046604662 -0.07912441 -0.0071584991 0.011267230 0.241064550
## sqm_above -0.092964978 -0.20704132 0.0004714251 0.204420171 0.061289560
## sqm_basement 0.085461296 0.42134941 -0.1434074389 -0.338014184 -0.017929533
## yr_built 0.228514272 0.17375695 -0.0903969133 -0.163055098 -0.064162845
## yr_renovated -0.845284783 0.14123679 0.2770684326 -0.007254797 0.024504733
## zipcode 0.017876531 0.02170621 -0.3680044859 0.196122921 -0.005471683
## lat 0.014992231 -0.34465539 0.0844999967 -0.403496069 -0.339253378
## long -0.076599089 -0.12671029 0.1692768957 -0.288222116 -0.142711651
## sqm_living15 0.017348783 -0.14407228 0.0251327356 -0.097506149 0.305497144
## sqm_lot15 -0.149546681 -0.63306433 -0.2649728095 -0.135416584 -0.010679946
## PC11 PC12 PC13 PC14 PC15
## price -0.01587609 0.199658023 -0.228945055 -0.199217847 0.41171585
## bedrooms 0.44839278 -0.310191171 -0.156937967 0.225030211 -0.01246251
## bathrooms -0.30772989 -0.010159655 0.033366864 -0.013304226 0.27002146
## sqm_living 0.06243708 0.111683436 0.106119099 -0.190961079 0.04132992
## sqm_lot -0.13734523 0.039498459 -0.131007136 0.075390445 -0.07140419
## floors -0.27104903 -0.190656878 -0.140394110 -0.459186559 -0.41968330
## waterfront 0.08854052 0.349083795 0.134678419 0.034728031 -0.11840269
## view -0.04220131 -0.688186148 -0.162892352 0.064262871 0.07373766
## condition -0.37300567 -0.071797057 0.189209918 0.185845752 -0.04880691
## grade -0.07446169 0.187714319 -0.009057401 0.294651129 0.04167640
## sqm_above 0.22565459 0.057189271 0.082544309 -0.063895587 0.16439899
## sqm_basement -0.29264589 0.124762380 0.065773982 -0.276732968 -0.22183408
## yr_built -0.36025961 0.006205142 0.118365642 0.498751681 0.08059578
## yr_renovated -0.20992972 -0.022383135 0.095927123 0.205094899 -0.04260926
## zipcode 0.09644417 -0.088213414 0.674699108 0.009301181 0.12401563
## lat -0.12926000 -0.214842470 -0.091367471 0.168127636 -0.10220589
## long -0.01197953 -0.320415440 0.499569177 -0.311332663 0.16827412
## sqm_living15 0.23573501 0.039213631 0.211128623 0.174341945 -0.63978760
## sqm_lot15 -0.24763507 -0.007870628 -0.032988606 -0.051413265 -0.05374007
## PC16 PC17 PC18 PC19
## price -0.217386590 0.34834557 -0.509646570 -7.155817e-15
## bedrooms -0.235558002 0.12918510 -0.062134741 2.217223e-16
## bathrooms 0.619233485 0.39145906 0.270198414 -1.021713e-15
## sqm_living 0.081766178 -0.49125560 -0.013229746 -6.992604e-01
## sqm_lot -0.007370041 0.04536237 -0.017916288 -1.703924e-16
## floors -0.200728392 0.03360977 -0.038108423 -1.219782e-15
## waterfront 0.019860673 -0.02318549 0.093239664 -1.205997e-16
## view 0.039615475 -0.09325626 0.041406403 -6.911249e-17
## condition -0.020655213 -0.06040049 -0.054797757 -1.257580e-16
## grade -0.510423001 0.11105408 0.618615656 4.878886e-16
## sqm_above 0.181893210 -0.47937525 -0.006321435 6.304719e-01
## sqm_basement -0.170652924 -0.12251688 -0.015626765 3.369571e-01
## yr_built -0.121550571 -0.17624790 -0.443926175 1.954320e-16
## yr_renovated -0.043489137 -0.05509836 -0.077200491 -8.122276e-17
## zipcode -0.099050163 0.11309314 -0.082237251 -3.321866e-17
## lat 0.129513263 -0.16863937 0.074452279 5.914339e-17
## long -0.200651253 0.13422056 0.073643621 1.348441e-16
## sqm_living15 0.237402468 0.31489957 -0.210431995 -2.006462e-16
## sqm_lot15 -0.042100870 0.04092750 0.004723152 -1.195691e-16
fviz_pca_biplot(houses_pca, repel = TRUE, axes = c(1,2), col.ind = "#CDC5BF", col.var = "#7A67EE")
## Warning: ggrepel: 21612 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
fviz_eig(houses_pca, ylim = c(1, 100), addlabels = TRUE, barcolor = 1, barfill = "darkorange2",
main = "Scree Plot of nyc_PCA")
# The houses are placed in a map in order to properly visualize the dataset.
# We arbitrarily separate the houses in three distinct categories.
high_calif <- Houses$grade >= 9
med_calif <- (Houses$grade < 9 & Houses$grade > 4)
low_calif <- Houses$grade <= 4
Houses_HG <- subset(Houses, high_calif == TRUE) # High grade
Houses_MG <- subset(Houses, med_calif == TRUE) # Medium grade
Houses_LG <- subset(Houses, low_calif == FALSE) # Low grade
set_points <- data.frame( lat = c(min(Houses$lat) - 0.07, max(Houses$lat) + 0.07, max(Houses$lat) + 0.07,
min(Houses$lat) - 0.07, min(Houses$lat) - 0.07),
long = c(min(Houses$long) - 0.07, min(Houses$long) - 0.07, max(Houses$long) + 0.07,
max(Houses$long) + 0.07, min(Houses$long) - 0.07)) # Added extra lat and long in order to box observations better
m <- leaflet() %>%
addTiles() %>%
addCircleMarkers(data = Houses_LG, lng = Houses_LG$long, lat = Houses_LG$lat, popup = "Seattle",
radius = 1, color = "#FF1493", stroke = F, opacity = 0.5) %>%
addCircleMarkers(data = Houses_MG, lng = Houses_MG$long, lat = Houses_MG$lat, popup = "Seattle",
radius = 0.5, color = "black", stroke = F, opacity = 2) %>%
addCircleMarkers(data = Houses_HG, lng = Houses_HG$long, lat = Houses_HG$lat, popup = "Seattle",
radius = 0.5, color = "#00BFFF", stroke = F, opacity = 5) %>%
addPolylines(data = set_points, lng = ~long, lat = ~lat, weight = 3,
opacity = 6, col = "black")
addLegend(m, position = "topright", labels = c("High", "Medium","Low"), colors = c("#00BFFF", "black","#FF1493"),
title = "Grade qualification")
# Certain variables are transformed into factors to represent them correctly.
Houses$waterfront <- as.factor(Houses$waterfront)
Houses$view <- as.factor(Houses$view)
Houses$condition <- factor(Houses$condition)
Houses$floors <- as.factor(Houses$floors)
# A new variable is created using already existent ones.
Houses$age <- 2015 - apply(dplyr::select(Houses, yr_built, yr_renovated), FUN = max, MARGIN = 1) # This new variable represents the years passed with no modifications done to the house.
# The data is separated into a training dataset and a testing dataset.
set.seed(18) # This line is for reproducibility
sample <- sample.split(Houses, SplitRatio = 0.8)
Houses_tr <- subset(Houses, sample == TRUE)
Houses_te <- subset(Houses, sample == FALSE)
# We check if there is NA values.
sample3 <- is.na(Houses_tr)
table(sample3)
## sample3
## FALSE
## 367422
We observe no NA values defined as such.
We’ll study if variables’ values are coherent.
# Search of missing values on numerical variables
cat('\n', "Bedrooms")
##
## Bedrooms
table(Houses_tr[,"bedrooms"] == 0)
##
## FALSE TRUE
## 16689 12
cat('\n', "Bathrooms")
##
## Bathrooms
table(Houses_tr[,"bathrooms"] == 0)
##
## FALSE TRUE
## 16692 9
cat('\n', "Sqm_living")
##
## Sqm_living
table(Houses_tr[,"sqm_living"] == 0)
##
## FALSE
## 16701
cat('\n', "Sqm_lot")
##
## Sqm_lot
table(Houses_tr[,"sqm_lot"] == 0)
##
## FALSE
## 16701
cat('\n', "Floors")
##
## Floors
table(Houses_tr[,"floors"] == 0)
##
## FALSE
## 16701
cat('\n', "Sqm_above")
##
## Sqm_above
table(Houses_tr[,"sqm_above"] == 0)
##
## FALSE
## 16701
cat('\n', "Sqm_basement")
##
## Sqm_basement
table(Houses_tr[,"sqm_basement"] == 0)
##
## FALSE TRUE
## 6535 10166
cat('\n', "Yr_built")
##
## Yr_built
table(Houses_tr[,"yr_built"] == 0)
##
## FALSE
## 16701
cat('\n', "Yr_renovated")
##
## Yr_renovated
table(Houses_tr[,"yr_renovated"] == 0)
##
## FALSE TRUE
## 722 15979
cat('\n', "Zipcode")
##
## Zipcode
table(Houses_tr[,"zipcode"] == 0)
##
## FALSE
## 16701
cat('\n', "Lat")
##
## Lat
table(Houses_tr[,"lat"] == 0)
##
## FALSE
## 16701
cat('\n', "Long")
##
## Long
table(Houses_tr[,"long"] == 0)
##
## FALSE
## 16701
cat('\n', "Sqm_living15")
##
## Sqm_living15
table(Houses_tr[,"sqm_living15"] == 0)
##
## FALSE
## 16701
cat('\n', "Sqm_lot15")
##
## Sqm_lot15
table(Houses_tr[,"sqm_lot15"] == 0)
##
## FALSE
## 16701
We observe houses containing \(0\) bedrooms and/or bathrooms, which initially does no make sense.
# We check the houses with a incoherent value for bathrooms and/or bedrooms.
sample1 <- Houses_tr$bedrooms == 0
(House_no_bed <- subset(Houses_tr, sample1 == TRUE))
## attribute_0 id price bedrooms bathrooms sqm_living sqm_lot
## 876 875 6306400140 1095000 0 0.00 284.65491 442.59008
## 3120 3119 3918400017 380000 0 0.00 136.56747 90.95208
## 3468 3467 1453602309 288000 0 1.50 132.85135 153.29002
## 4869 4868 6896300380 228000 0 1.00 36.23219 548.12794
## 6995 6994 2954400190 1295650 0 0.00 446.86362 2602.02834
## 8478 8477 2569500210 339950 0 2.50 212.74796 772.86039
## 9774 9773 3374500520 355000 0 0.00 228.54148 747.77657
## 9855 9854 7849202190 235000 0 0.00 136.56747 445.93459
## 12654 12653 7849202299 320000 0 2.50 138.42553 660.63352
## 14424 14423 9543000205 139950 0 0.00 78.41017 396.60308
## 18380 18379 1222029077 265000 0 0.75 35.67477 19829.59647
## 19453 19452 3980300371 142000 0 0.00 26.94188 1939.35096
## floors waterfront view condition grade sqm_above sqm_basement yr_built
## 876 3.5 0 2 3 7 284.65491 0 1990
## 3120 3 0 2 3 8 136.56747 0 2006
## 3468 3 0 0 3 7 132.85135 0 1999
## 4869 1 0 0 2 4 36.23219 0 1953
## 6995 2 0 0 3 12 446.86362 0 1990
## 8478 2 0 0 3 8 212.74796 0 1985
## 9774 2 0 0 3 8 228.54148 0 1990
## 9855 2 0 0 3 7 136.56747 0 1996
## 12654 2 0 0 3 7 138.42553 0 1999
## 14424 1 0 0 4 7 78.41017 0 1913
## 18380 1 0 0 3 4 35.67477 0 2003
## 19453 1 0 0 1 1 26.94188 0 1963
## yr_renovated zipcode lat long sqm_living15 sqm_lot15 age
## 876 0 98102 47.6362 -122.322 219.25117 403.1063 25
## 3120 0 98133 47.7145 -122.356 136.56747 156.7274 9
## 3468 0 98125 47.7222 -122.290 132.85135 211.3544 16
## 4869 0 98118 47.5260 -122.261 201.59960 536.4222 62
## 6995 0 98053 47.6642 -122.069 440.36041 355.9115 25
## 8478 0 98042 47.3473 -122.151 232.25760 728.7314 30
## 9774 0 98031 47.4095 -122.168 234.11566 675.1264 25
## 9855 0 98065 47.5265 -121.828 98.47722 610.3730 19
## 12654 0 98065 47.5261 -121.826 139.35456 450.9514 16
## 14424 0 98001 47.2781 -122.250 128.20620 781.9649 102
## 18380 0 98070 47.4177 -122.491 178.37384 284.5620 12
## 19453 0 98024 47.5308 -121.888 150.50292 287.4420 52
sample2 <- Houses_tr$bathrooms == 0
(House_no_bath <- subset(Houses_tr, sample2 == TRUE))
## attribute_0 id price bedrooms bathrooms sqm_living sqm_lot
## 876 875 6306400140 1095000 0 0 284.65491 442.59008
## 1150 1149 3421079032 75000 1 0 62.24504 4029.85517
## 3120 3119 3918400017 380000 0 0 136.56747 90.95208
## 6995 6994 2954400190 1295650 0 0 446.86362 2602.02834
## 9774 9773 3374500520 355000 0 0 228.54148 747.77657
## 9855 9854 7849202190 235000 0 0 136.56747 445.93459
## 10482 10481 203100435 484000 1 0 64.10310 2159.43826
## 14424 14423 9543000205 139950 0 0 78.41017 396.60308
## 19453 19452 3980300371 142000 0 0 26.94188 1939.35096
## floors waterfront view condition grade sqm_above sqm_basement yr_built
## 876 3.5 0 2 3 7 284.65491 0 1990
## 1150 1 0 0 3 3 62.24504 0 1966
## 3120 3 0 2 3 8 136.56747 0 2006
## 6995 2 0 0 3 12 446.86362 0 1990
## 9774 2 0 0 3 8 228.54148 0 1990
## 9855 2 0 0 3 7 136.56747 0 1996
## 10482 1 0 0 4 7 64.10310 0 1948
## 14424 1 0 0 4 7 78.41017 0 1913
## 19453 1 0 0 1 1 26.94188 0 1963
## yr_renovated zipcode lat long sqm_living15 sqm_lot15 age
## 876 0 98102 47.6362 -122.322 219.25117 403.1063 25
## 1150 0 98022 47.2638 -121.906 107.76753 425.3101 49
## 3120 0 98133 47.7145 -122.356 136.56747 156.7274 9
## 6995 0 98053 47.6642 -122.069 440.36041 355.9115 25
## 9774 0 98031 47.4095 -122.168 234.11566 675.1264 25
## 9855 0 98065 47.5265 -121.828 98.47722 610.3730 19
## 10482 0 98053 47.6429 -121.955 157.00614 248.3298 67
## 14424 0 98001 47.2781 -122.250 128.20620 781.9649 102
## 19453 0 98024 47.5308 -121.888 150.50292 287.4420 52
As no NANs are found, we transform the zero values of bedrooms and bathrooms into NANs. By this, we’ll be able to impute them a value based on their neighbours
Houses_tr$bedrooms[Houses_tr$bedrooms == 0] <- NA
Houses_tr$bathrooms[Houses_tr$bathrooms == 0] <- NA
summary(Houses_tr)
## attribute_0 id price bedrooms
## Min. : 1 Min. :1.000e+06 Min. : 75000 Min. : 1.000
## 1st Qu.: 5403 1st Qu.:2.120e+09 1st Qu.: 322500 1st Qu.: 3.000
## Median :10807 Median :3.905e+09 Median : 450000 Median : 3.000
## Mean :10806 Mean :4.575e+09 Mean : 538594 Mean : 3.371
## 3rd Qu.:16209 3rd Qu.:7.300e+09 3rd Qu.: 642000 3rd Qu.: 4.000
## Max. :21612 Max. :9.900e+09 Max. :7062500 Max. :33.000
## NA's :12
## bathrooms sqm_living sqm_lot floors waterfront
## Min. :0.500 Min. : 26.94 Min. : 48.31 1 :8244 0:16572
## 1st Qu.:1.750 1st Qu.: 131.92 1st Qu.: 467.30 1.5:1504 1: 129
## Median :2.250 Median : 177.44 Median : 706.06 2 :6351
## Mean :2.113 Mean : 192.79 Mean : 1390.08 2.5: 123
## 3rd Qu.:2.500 3rd Qu.: 236.90 3rd Qu.: 992.39 3 : 474
## Max. :8.000 Max. :1257.91 Max. :153416.27 3.5: 5
## NA's :9
## view condition grade sqm_above sqm_basement
## 0:15063 1: 22 Min. : 1.000 Min. : 26.94 Min. : 0.00
## 1: 254 2: 141 1st Qu.: 7.000 1st Qu.:110.55 1st Qu.: 0.00
## 2: 741 3:10798 Median : 7.000 Median :144.93 Median : 0.00
## 3: 387 4: 4406 Mean : 7.649 Mean :165.91 Mean : 26.87
## 4: 256 5: 1334 3rd Qu.: 8.000 3rd Qu.:204.39 3rd Qu.: 52.03
## Max. :13.000 Max. :874.22 Max. :383.69
##
## yr_built yr_renovated zipcode lat
## Min. :1900 Min. : 0.00 Min. :98001 Min. :47.16
## 1st Qu.:1951 1st Qu.: 0.00 1st Qu.:98033 1st Qu.:47.47
## Median :1975 Median : 0.00 Median :98065 Median :47.57
## Mean :1971 Mean : 86.29 Mean :98078 Mean :47.56
## 3rd Qu.:1997 3rd Qu.: 0.00 3rd Qu.:98118 3rd Qu.:47.68
## Max. :2015 Max. :2015.00 Max. :98199 Max. :47.78
##
## long sqm_living15 sqm_lot15 age
## Min. :-122.5 Min. : 37.07 Min. : 0.0929 Min. : 0.0
## 1st Qu.:-122.3 1st Qu.:137.50 1st Qu.:294.2239 1st Qu.: 16.0
## Median :-122.2 Median :170.94 Median :481.7023 Median : 38.0
## Mean :-122.2 Mean :184.47 Mean :455.2054 Mean : 41.8
## 3rd Qu.:-122.1 3rd Qu.:220.18 3rd Qu.:641.4955 3rd Qu.: 61.0
## Max. :-121.3 Max. :537.91 Max. :807.2345 Max. :115.0
##
Bath <- data.frame(
atr <- which(is.na(Houses_tr$bathrooms)),
s <- Houses_tr$sqm_living[atr],
Floors <- Houses_tr$floors[atr]
)
ggplot(Bath, aes(x = atr, y = s, color = Floors)) +
geom_point(show.legend = TRUE, shape=18, size=3) +
xlab('attribute_0') +
ylab('sqm_living') +
labs(title = "MISSING VALUES OF BATHROOMS") +
scale_color_manual(values=c('#030303','#FF4500','#4EEE94', '#FFE7BA')) +
theme_minimal()+theme(panel.background = element_rect(fill = "gray"),
legend.position="bottom")
Bed <- data.frame(
atr <- which(is.na(Houses_tr$bedrooms)),
s <- Houses_tr$sqm_living[atr],
Floors <- Houses_tr$floors[atr]
)
ggplot(Bed, aes(x = atr, y = s, color = Floors)) +
geom_point(show.legend = TRUE, shape=18, size=3) +
xlab('attribute_0') +
ylab('sqm_living') +
labs(title = "MISSING VALUES OF BEDROOMS") +
scale_color_manual(values=c('#030303','#FF4500','#4EEE94', '#FFE7BA')) +
theme_minimal()+theme(panel.background = element_rect(fill = "gray"),
legend.position="bottom")
# New values wil be assign to the missing values through the K-nn algorithm
names_columns <- colnames(Houses_tr)
var_to_use_knn <- names_columns[names_columns != c("attribute_0", "id")] # We don't want certain variables to be used for knn method
houses_knn <- kNN(Houses_tr, variable = c("bedrooms", "bathrooms"), dist_var = var_to_use_knn, k = 129, imp_var = FALSE) #imp_var avoids the creation of variables showing imputation status
Houses_tr <- houses_knn
summary(Houses_tr)
## attribute_0 id price bedrooms
## Min. : 1 Min. :1.000e+06 Min. : 75000 Min. : 1.00
## 1st Qu.: 5403 1st Qu.:2.120e+09 1st Qu.: 322500 1st Qu.: 3.00
## Median :10807 Median :3.905e+09 Median : 450000 Median : 3.00
## Mean :10806 Mean :4.575e+09 Mean : 538594 Mean : 3.37
## 3rd Qu.:16209 3rd Qu.:7.300e+09 3rd Qu.: 642000 3rd Qu.: 4.00
## Max. :21612 Max. :9.900e+09 Max. :7062500 Max. :33.00
## bathrooms sqm_living sqm_lot floors waterfront
## Min. :0.500 Min. : 26.94 Min. : 48.31 1 :8244 0:16572
## 1st Qu.:1.750 1st Qu.: 131.92 1st Qu.: 467.30 1.5:1504 1: 129
## Median :2.250 Median : 177.44 Median : 706.06 2 :6351
## Mean :2.113 Mean : 192.79 Mean : 1390.08 2.5: 123
## 3rd Qu.:2.500 3rd Qu.: 236.90 3rd Qu.: 992.39 3 : 474
## Max. :8.000 Max. :1257.91 Max. :153416.27 3.5: 5
## view condition grade sqm_above sqm_basement
## 0:15063 1: 22 Min. : 1.000 Min. : 26.94 Min. : 0.00
## 1: 254 2: 141 1st Qu.: 7.000 1st Qu.:110.55 1st Qu.: 0.00
## 2: 741 3:10798 Median : 7.000 Median :144.93 Median : 0.00
## 3: 387 4: 4406 Mean : 7.649 Mean :165.91 Mean : 26.87
## 4: 256 5: 1334 3rd Qu.: 8.000 3rd Qu.:204.39 3rd Qu.: 52.03
## Max. :13.000 Max. :874.22 Max. :383.69
## yr_built yr_renovated zipcode lat
## Min. :1900 Min. : 0.00 Min. :98001 Min. :47.16
## 1st Qu.:1951 1st Qu.: 0.00 1st Qu.:98033 1st Qu.:47.47
## Median :1975 Median : 0.00 Median :98065 Median :47.57
## Mean :1971 Mean : 86.29 Mean :98078 Mean :47.56
## 3rd Qu.:1997 3rd Qu.: 0.00 3rd Qu.:98118 3rd Qu.:47.68
## Max. :2015 Max. :2015.00 Max. :98199 Max. :47.78
## long sqm_living15 sqm_lot15 age
## Min. :-122.5 Min. : 37.07 Min. : 0.0929 Min. : 0.0
## 1st Qu.:-122.3 1st Qu.:137.50 1st Qu.:294.2239 1st Qu.: 16.0
## Median :-122.2 Median :170.94 Median :481.7023 Median : 38.0
## Mean :-122.2 Mean :184.47 Mean :455.2054 Mean : 41.8
## 3rd Qu.:-122.1 3rd Qu.:220.18 3rd Qu.:641.4955 3rd Qu.: 61.0
## Max. :-121.3 Max. :537.91 Max. :807.2345 Max. :115.0
# We choose k = 129 as it is sqrt(SIZE) of our training data.
# We plot the values for each variable in order to visually find outliers.
# Price
ggplot(Houses_tr, aes(attribute_0, price)) +
geom_point(color = "#AB82FF") +
ylab("price ($)") +
geom_point(data = Houses_tr[which(Houses_tr$bedrooms == 33),], colour = "black") +
geom_point(data = Houses_tr[which(Houses_tr$sqm_living > 1000),], colour = "red")
head(Houses_tr[order(Houses_tr$price, decreasing = TRUE),])
## attribute_0 id price bedrooms bathrooms sqm_living sqm_lot
## 3025 3914 9808700762 7062500 5 4.50 932.7465 3467.606
## 1017 1315 7558700030 5300000 6 6.00 686.5535 2306.690
## 900 1164 1247600105 5110800 5 5.25 744.1534 4228.668
## 2030 2626 7738500731 4500000 5 5.50 616.8762 3717.422
## 9559 12370 6065300370 4208000 5 6.00 691.1986 2001.131
## 3207 4149 6447300265 4000000 4 5.50 657.7535 1539.682
## floors waterfront view condition grade sqm_above sqm_basement yr_built
## 3025 2 1 2 3 11 713.4953 219.25117 1940
## 1017 2 1 4 4 12 464.5152 222.03827 1991
## 900 2 1 4 3 12 556.4892 187.66414 1999
## 2030 2 1 4 3 12 589.9343 26.94188 2004
## 9559 2 0 0 3 12 515.6119 175.58675 2003
## 3207 2 0 0 3 12 535.1215 122.63201 2008
## yr_renovated zipcode lat long sqm_living15 sqm_lot15 age
## 3025 2001 98004 47.6500 -122.214 365.1089 302.4923 14
## 1017 0 98040 47.5631 -122.210 401.3411 297.3826 24
## 900 0 98033 47.6767 -122.211 318.6574 310.8536 16
## 2030 0 98155 47.7493 -122.280 281.4962 291.1581 11
## 9559 0 98006 47.5692 -122.189 440.3604 248.4227 12
## 3207 0 98039 47.6151 -122.224 291.7155 202.4357 7
# Bedrooms
ggplot(Houses_tr, aes(attribute_0, bedrooms)) +
geom_point(color = "#AB82FF") +
geom_point(data = Houses_tr[which(Houses_tr$bedrooms == 33),], colour = "black") +
geom_point(data = Houses_tr[which(Houses_tr$sqm_living > 1000),], colour = "red")
head(Houses_tr[order(Houses_tr$bedrooms, decreasing = TRUE),])
## attribute_0 id price bedrooms bathrooms sqm_living sqm_lot
## 12264 15870 2402100895 640000 33 1.75 150.5029 557.4182
## 6767 8757 1773100755 520000 11 3.00 278.7091 460.7991
## 10288 13314 627300145 1148000 10 5.25 426.4250 1014.5012
## 11715 15161 5566100170 650000 10 2.00 335.3800 1106.8468
## 14878 19254 8812401450 660000 10 3.00 271.2769 347.9219
## 3165 4096 1997200215 599999 9 4.50 355.8186 649.2064
## floors waterfront view condition grade sqm_above sqm_basement yr_built
## 12264 1 0 0 5 7 96.61916 53.88376 1947
## 6767 2 0 0 3 7 222.96730 55.74182 1918
## 10288 1 0 2 3 9 232.25760 194.16735 2008
## 11715 2 0 0 4 7 279.63815 55.74182 1958
## 14878 2 0 0 4 7 172.79965 98.47722 1913
## 3165 2.5 0 0 3 7 227.61245 128.20620 1938
## yr_renovated zipcode lat long sqm_living15 sqm_lot15 age
## 12264 0 98103 47.6878 -122.331 123.5610 452.90232 68
## 6767 1999 98106 47.5560 -122.363 131.9223 468.23132 16
## 10288 0 98004 47.5861 -122.113 253.6253 24.06189 7
## 11715 0 98006 47.5705 -122.175 189.5222 92.25272 57
## 14878 0 98105 47.6635 -122.320 168.1545 382.76052 102
## 3165 0 98103 47.6927 -122.338 135.6384 555.37437 77
# Bathrooms
ggplot(Houses_tr, aes(attribute_0, bathrooms)) +
geom_point(color = "#AB82FF") +
geom_point(data = Houses_tr[which(Houses_tr$bedrooms == 33),], colour = "black") +
geom_point(data = Houses_tr[which(Houses_tr$sqm_living > 1000),], colour = "red")
head(Houses_tr[order(Houses_tr$bathrooms, decreasing = TRUE),])
## attribute_0 id price bedrooms bathrooms sqm_living sqm_lot
## 9874 12777 1225069038 2280000 7 8.00 1257.9072 28591.0964
## 6605 8546 424049043 450000 9 7.50 376.2573 604.2414
## 3110 4024 9175600025 800000 7 6.75 694.9147 3870.7123
## 15902 20578 424069279 1180000 6 6.50 581.5730 1017.7528
## 16619 21506 2524069097 2238890 5 6.50 675.4051 12078.9746
## 14143 18302 6072800246 3300000 5 6.25 745.0824 2019.5263
## floors waterfront view condition grade sqm_above sqm_basement yr_built
## 9874 3 0 4 3 12 874.2176 383.68956 1999
## 6605 2 0 0 3 7 376.2573 0.00000 1996
## 3110 2 0 2 3 11 471.9474 222.96730 1953
## 15902 2 0 0 3 11 449.6507 131.92232 2007
## 16619 2 0 0 3 12 596.4375 78.96758 2010
## 14143 2 0 0 3 11 745.0824 0.00000 2001
## yr_renovated zipcode lat long sqm_living15 sqm_lot15 age
## 9874 0 98053 47.6675 -121.986 450.5797 277.1298 16
## 6605 0 98144 47.5923 -122.301 134.5236 393.3515 19
## 3110 0 98166 47.4643 -122.368 261.0575 343.3696 62
## 15902 0 98075 47.5947 -122.039 251.7672 116.5004 8
## 16619 0 98027 47.5371 -121.982 167.2255 438.3165 5
## 14143 0 98006 47.5675 -122.189 386.4766 244.9853 14
# Sqm_living
ggplot(Houses_tr, aes(attribute_0, sqm_living)) +
geom_point(color = "#AB82FF") +
geom_point(data = Houses_tr[which(Houses_tr$bedrooms == 33),], colour = "black") +
geom_point(data = Houses_tr[which(Houses_tr$sqm_living > 1000),], colour = "red")
head(Houses_tr[order(Houses_tr$sqm_living, decreasing = TRUE),])
## attribute_0 id price bedrooms bathrooms sqm_living sqm_lot
## 9874 12777 1225069038 2280000 7 8.00 1257.9072 28591.096
## 3025 3914 9808700762 7062500 5 4.50 932.7465 3467.606
## 14143 18302 6072800246 3300000 5 6.25 745.0824 2019.526
## 900 1164 1247600105 5110800 5 5.25 744.1534 4228.668
## 10364 13411 2426039123 2415000 5 4.75 732.0760 2252.899
## 12962 16773 1630700380 1920000 5 5.75 718.1405 21448.339
## floors waterfront view condition grade sqm_above sqm_basement yr_built
## 9874 3 0 4 3 12 874.2176 383.68956 1999
## 3025 2 1 2 3 11 713.4953 219.25117 1940
## 14143 2 0 0 3 11 745.0824 0.00000 2001
## 900 2 1 4 3 12 556.4892 187.66414 1999
## 10364 2 0 2 3 13 732.0760 0.00000 1996
## 12962 2 0 0 3 12 618.7342 99.40625 2004
## yr_renovated zipcode lat long sqm_living15 sqm_lot15 age
## 9874 0 98053 47.6675 -121.986 450.5797 277.12977 16
## 3025 2001 98004 47.6500 -122.214 365.1089 302.49230 14
## 14143 0 98006 47.5675 -122.189 386.4766 244.98532 14
## 900 0 98033 47.6767 -122.211 318.6574 310.85357 16
## 10364 0 98177 47.7334 -122.362 254.5543 43.10701 19
## 12962 0 98077 47.7615 -122.084 247.1221 397.62501 11
# Sqm_lot
ggplot(Houses_tr, aes(attribute_0, sqm_lot)) +
geom_point(color = "#AB82FF") +
geom_point(data = Houses_tr[which(Houses_tr$bedrooms == 33),], colour = "black") +
geom_point(data = Houses_tr[which(Houses_tr$sqm_living > 1000),], colour = "red")
head(Houses_tr[order(Houses_tr$sqm_lot, decreasing = TRUE),])
## attribute_0 id price bedrooms bathrooms sqm_living sqm_lot
## 1328 1719 1020069017 700000 4 1.00 120.77395 153416.27
## 13383 17319 3326079016 190000 2 1.00 65.96116 108212.90
## 5910 7647 2623069031 542500 5 3.25 279.63815 99798.12
## 6003 7769 2323089009 855000 4 3.50 374.39925 95139.03
## 3432 4441 3626079040 790000 2 3.00 237.83178 91256.61
## 5170 6691 2624089007 1998000 2 2.50 362.32186 85510.09
## floors waterfront view condition grade sqm_above sqm_basement yr_built
## 1328 1 0 3 4 6 120.77395 0.00000 1920
## 13383 1 0 0 2 5 65.96116 0.00000 1915
## 5910 1.5 0 0 5 8 186.73511 92.90304 1931
## 6003 2 0 0 3 10 374.39925 0.00000 2006
## 3432 1 0 0 3 8 237.83178 0.00000 2004
## 5170 2 0 0 3 12 362.32186 0.00000 2009
## yr_renovated zipcode lat long sqm_living15 sqm_lot15 age
## 1328 0 98022 47.2313 -122.023 237.8318 422.52303 95
## 13383 0 98014 47.6888 -121.909 156.0771 214.79183 100
## 5910 0 98027 47.4564 -122.004 227.6124 591.04914 84
## 6003 0 98045 47.4619 -121.744 170.0126 83.89145 9
## 3432 0 98014 47.6955 -121.861 150.5029 411.09595 11
## 5170 0 98065 47.5371 -121.756 252.6963 413.04692 6
# Sqm_above
ggplot(Houses_tr, aes(attribute_0, sqm_above)) +
geom_point(color = "#AB82FF") +
geom_point(data = Houses_tr[which(Houses_tr$bedrooms == 33),], colour = "black") +
geom_point(data = Houses_tr[which(Houses_tr$sqm_living > 1000),], colour = "red")
head(Houses_tr[order(Houses_tr$sqm_above, decreasing = TRUE),])
## attribute_0 id price bedrooms bathrooms sqm_living sqm_lot
## 9874 12777 1225069038 2280000 7 8.00 1257.9072 28591.096
## 14143 18302 6072800246 3300000 5 6.25 745.0824 2019.526
## 10364 13411 2426039123 2415000 5 4.75 732.0760 2252.899
## 3025 3914 9808700762 7062500 5 4.50 932.7465 3467.606
## 9174 11871 8835800350 1950000 4 3.25 689.3406 15595.540
## 14368 18594 3023069166 1135250 5 4.00 680.0503 20234.282
## floors waterfront view condition grade sqm_above sqm_basement yr_built
## 9874 3 0 4 3 12 874.2176 383.6896 1999
## 14143 2 0 0 3 11 745.0824 0.0000 2001
## 10364 2 0 2 3 13 732.0760 0.0000 1996
## 3025 2 1 2 3 11 713.4953 219.2512 1940
## 9174 2 0 3 3 12 689.3406 0.0000 2002
## 14368 2 0 0 3 11 680.0503 0.0000 1992
## yr_renovated zipcode lat long sqm_living15 sqm_lot15 age
## 9874 0 98053 47.6675 -121.986 450.5797 277.12977 16
## 14143 0 98006 47.5675 -122.189 386.4766 244.98532 14
## 10364 0 98177 47.7334 -122.362 254.5543 43.10701 19
## 3025 2001 98004 47.6500 -122.214 365.1089 302.49230 14
## 9174 0 98045 47.4548 -121.764 521.1861 217.95053 13
## 14368 0 98058 47.4473 -122.086 303.7929 350.80188 23
# Sqm_basement
ggplot(Houses_tr, aes(attribute_0, sqm_basement)) +
geom_point(color = "#AB82FF") +
geom_point(data = Houses_tr[which(Houses_tr$bedrooms == 33),], colour = "black") +
geom_point(data = Houses_tr[which(Houses_tr$sqm_living > 1000),], colour = "red")
head(Houses_tr[order(Houses_tr$sqm_basement, decreasing = TRUE),])
## attribute_0 id price bedrooms bathrooms sqm_living sqm_lot
## 9874 12777 1225069038 2280000 7 8.00 1257.9072 28591.096
## 11964 15482 624069108 3200000 4 3.25 650.3213 2620.423
## 7794 10085 7767000060 1900000 5 4.25 604.7988 1530.206
## 16493 21344 8835770170 1488000 5 6.00 639.1729 26009.878
## 5437 7035 853200010 3800000 5 5.50 654.9664 3979.966
## 5122 6628 3322049005 850000 4 2.75 505.3925 22257.710
## floors waterfront view condition grade sqm_above sqm_basement yr_built
## 9874 3 0 4 3 12 874.2176 383.6896 1999
## 11964 1 1 4 4 12 325.1606 325.1606 1991
## 7794 2 0 3 4 11 301.9349 302.8639 1980
## 16493 2 0 3 3 12 378.1154 261.0575 2007
## 5437 1 0 2 4 13 401.3411 253.6253 1978
## 5122 1 0 0 2 9 252.6963 252.6963 1969
## yr_renovated zipcode lat long sqm_living15 sqm_lot15 age
## 9874 0 98053 47.6675 -121.986 450.5797 277.1298 16
## 11964 0 98075 47.5928 -122.086 456.4326 173.9145 24
## 7794 0 98040 47.5758 -122.242 416.2056 210.9828 35
## 16493 0 98045 47.4624 -121.779 435.7153 304.2575 8
## 5437 0 98004 47.6229 -122.220 471.0184 264.6808 37
## 5122 0 98001 47.3540 -122.293 183.0190 405.8934 46
# Yr_built
ggplot(Houses_tr, aes(attribute_0, age)) +
geom_point(color = "#AB82FF") +
geom_point(data = Houses_tr[which(Houses_tr$bedrooms == 33),], colour = "black") +
geom_point(data = Houses_tr[which(Houses_tr$sqm_living > 1000),], colour = "red")
head(Houses_tr[order(Houses_tr$yr_built, decreasing = TRUE),])
## attribute_0 id price bedrooms bathrooms sqm_living sqm_lot
## 497 643 9385200045 729500 3 2.50 154.21905 101.3572
## 1362 1763 1832100030 597326 4 4.00 331.66385 766.4501
## 2076 2687 3076500830 385195 1 1.00 65.96116 557.4182
## 6213 8039 1250200495 455000 2 1.50 111.48365 116.9649
## 6511 8425 558100090 628000 5 2.75 241.54790 758.0888
## 10984 14215 8156600210 1285000 5 3.50 276.85106 473.8055
## floors waterfront view condition grade sqm_above sqm_basement yr_built
## 497 3 0 1 3 9 142.14165 12.07740 2015
## 1362 2 0 0 3 10 265.70269 65.96116 2015
## 2076 1.5 0 0 3 6 65.96116 0.00000 2015
## 6213 2 0 0 3 8 92.90304 18.58061 2015
## 6511 2 0 0 3 8 241.54790 0.00000 2015
## 10984 2 0 0 3 10 220.18020 56.67085 2015
## yr_renovated zipcode lat long sqm_living15 sqm_lot15 age
## 497 0 98116 47.5818 -122.402 140.2836 144.92874240 0
## 1362 0 98040 47.5784 -122.226 207.1738 0.09290304 0
## 2076 0 98144 47.5756 -122.316 133.7804 458.19779328 0
## 6213 0 98144 47.6001 -122.298 122.6320 239.41113408 0
## 6511 0 98133 47.7348 -122.340 148.6449 684.41669568 0
## 10984 0 98115 47.6782 -122.299 165.3674 477.89323776 0
House \('1225069038'\) has a very large dimension on sqm_living, was built in 1999 but is considerably cheaper than other houses. We will delete this individual.
# Elimination of outlier.
Houses_tr <- Houses_tr[!Houses_tr$id == 1225069038,]
if(! any(Houses_tr$id == 1225069038)){
print("Deleted successfully")
}else{
print("Error!")
}
## [1] "Deleted successfully"
# Manual imputation of bedrooms due to human error.
Houses_tr$bedrooms[Houses_tr$id == 2402100895] <- 3
(Houses_tr[Houses_tr$id == 2402100895,])
## attribute_0 id price bedrooms bathrooms sqm_living sqm_lot
## 12264 15870 2402100895 640000 3 1.75 150.5029 557.4182
## floors waterfront view condition grade sqm_above sqm_basement yr_built
## 12264 1 0 0 5 7 96.61916 53.88376 1947
## yr_renovated zipcode lat long sqm_living15 sqm_lot15 age
## 12264 0 98103 47.6878 -122.331 123.561 452.9023 68
# We search for outliers using z score.
# We create a separate dataset to work on.
Houses_z <- Houses_tr
Houses_z$price_z_score <- abs(scale(Houses_z$price, center = TRUE, scale = TRUE))
Houses_z$bedrooms_z_score <- abs(scale(Houses_z$bedrooms, center = TRUE, scale = TRUE))
Houses_z$bathrooms_z_score <- abs(scale(Houses_z$bathrooms, center = TRUE, scale = TRUE))
Houses_z$living_z_score <- abs(scale(Houses_z$sqm_living, center = TRUE, scale = TRUE))
Houses_z$lot_z_score <- abs(scale(Houses_z$sqm_lot, center = TRUE, scale = TRUE))
Houses_z$above_z_score <- abs(scale(Houses_z$sqm_above, center = TRUE, scale = TRUE))
Houses_z$age_z_score <- abs(scale(Houses_z$age, center = TRUE, scale = TRUE))
collist <- c(23, 24, 25, 26, 27, 28, 29)
num <- vector()
for (row in 1:length(Houses_z[,1]))
{
sum <- 0
for (element in collist)
{
if (Houses_z[row,element] > 3){ # If the z_score value is higher than 3, we will count that as an outlier.
sum <- sum + 1
}
}
num <- c(num, sum)
}
Houses_z$num <- num
# We will consider outliers those houses with a number higher of 3.
outlier_id <- Houses_z$id[Houses_z$num > 3]
Houses_tr <- Houses_tr[!(Houses_tr$id %in% outlier_id),]
# We transform the data for price in order to obtain gaussianity.
h <- hist(Houses_tr$price, xlab = "price ($)", main = "Histogram of price",
breaks = 25, ylim = c(0, 6000))
grid(nx = NA, ny = NULL, lty = 2, col = "gray", lwd = 1)
h <- hist(Houses_tr$price, col = "#87CEFA", breaks = 25, ylim = c(0, 6000), add = TRUE)
text(h$mids, h$counts, labels = h$counts, adj=c(0.5, -0.5), cex=0.55)
h <- hist(log(Houses_tr$price), xlab = "log(price)" , main = "Histogram of log(price)",
ylim = c(0,1), breaks = 15, prob = TRUE, col = "#87CEFA")
grid(nx = NA, ny = NULL, lty = 2, col = "gray", lwd = 1)
h <- hist(Houses_tr$price, ylim = c(0,1), col = "#87CEFA", add = TRUE, breaks = 15, prob = TRUE)
curve(dnorm(x, mean(log(Houses_tr$price)), sd(log(Houses_tr$price))), col = "#EE6AA7", add = TRUE, lwd = 1.75)
Houses_tr$price = (log(Houses_tr$price)) # Apply it for the training set
Houses_te$price = (log(Houses_te$price)) # Apply it for the test set
# We transform the data for sqm_living in order to obtain gaussianity.
h <- hist(Houses_tr$sqm_living, xlab = "Squared metres of living", main = "Histogram of sqm_living",
breaks = 25, ylim = c(0, 6000))
grid(nx = NA, ny = NULL, lty = 2, col = "gray", lwd = 1)
h <- hist(Houses_tr$sqm_living, col = "#87CEFA", breaks = 25, ylim = c(0, 6000), add = TRUE)
text(h$mids, h$counts, labels = h$counts, adj=c(0.5, -0.5), cex=0.55)
h <- hist(log(Houses_tr$sqm_living), xlab = "log(Squared metres of living)", main = "Histogram of log(sqm_living)",
xlim = c(3,8), ylim = c(0,1), breaks = 15, prob = TRUE)
grid(nx = NA, ny = NULL, lty = 2, col = "gray", lwd = 1)
h <- hist(log(Houses_tr$sqm_living), ylim = c(0,1), col = "#87CEFA", add = TRUE, breaks = 15, prob = TRUE)
curve(dnorm(x, mean(log(Houses_tr$sqm_living)), sd(log(Houses_tr$sqm_living))), col = "#EE6AA7", add = TRUE, lwd = 1.75)
Houses_tr$sqm_living = (log(Houses_tr$sqm_living)) # Apply it for the training set.
Houses_te$sqm_living = (log(Houses_te$sqm_living)) # Apply it for the test set.
# We transform the data for sqm_lot in order to obtain gaussianity.
hist(Houses_tr$sqm_lot, main="Squared metres of lot")
h <- hist(log(Houses_tr$sqm_lot), xlab = "sqm_lot_transf", main = "Transformed squared metres of lot",
ylim = c(0,5), breaks = 30, prob = TRUE)
grid(nx = NA, ny = NULL, lty = 2, col = "gray", lwd = 1)
h <- hist(log(Houses_tr$sqm_lot), ylim = c(0,5), col = "#87CEFA", add = TRUE, breaks = 30, prob = TRUE, xlab = "sqm_lot")
curve(dnorm(x, mean(log(Houses_tr$sqm_lot)), sd(log(Houses_tr$sqm_lot))), col = "#EE6AA7", add = TRUE, lwd = 1.75)
Houses_tr$sqm_lot = (log(Houses_tr$sqm_lot)) # Apply it for the training set.
Houses_te$sqm_lot = (log(Houses_te$sqm_lot)) # Apply it for the test set.
# We transform the data for sqm_above in order to obtain gaussianity.
hist(Houses_tr$sqm_above, main="Squared metres above")
bx = boxcox(I(sqm_above) ~ . - id - attribute_0, data = Houses_tr,
lambda = seq(-0.5, 0.5, length = 10))
lambda = bx$x[which.max(bx$y)]
sprintf("The value of lambda used is: %f", lambda)
## [1] "The value of lambda used is: 0.095960"
sqm_above_transf = (Houses_tr$sqm_above^lambda - 1)/lambda
h <- hist(sqm_above_transf, xlab = "sqm_above", main = "Squared metres above",
ylim = c(0,1), breaks = 30, prob = TRUE)
grid(nx = NA, ny = NULL, lty = 2, col = "gray", lwd = 1)
h <- hist((sqm_above_transf), ylim = c(0,1), col = "#87CEFA", add = TRUE, breaks = 30, prob = TRUE)
curve(dnorm(x, mean(sqm_above_transf), sd(sqm_above_transf)), col = "#EE6AA7", add = TRUE, lwd = 1.75)
As it doesn’t correct the skewness totally, we decide not to apply it.
# We transform the data for sqm_above in order to obtain gaussianity.
hist(Houses_tr$sqm_basement, main="Squared metres basement", breaks = 30, col = "#87CEFA", xlab = "sqm_basement")
It is already Gaussian.
# We transform the data for sqm_living15 in order to obtain gaussianity.
hist(Houses_tr$sqm_living15, main="Squared metres15", breaks = 30)
bx = boxcox(I(Houses_tr$sqm_living15) + 1 ~ . - id - attribute_0, data = Houses_tr,
lambda = seq(-0.25, 0.25, length = 10)) # if we don't add 1, we would compute negative logarithms
lambda = bx$x[which.max(bx$y)]
sprintf("The value of lambda used is: %f", lambda)
## [1] "The value of lambda used is: 0.148990"
sqm_living15_transf = (Houses_tr$sqm_living15^lambda - 1)/lambda
h <- hist(sqm_living15_transf, xlab = "sqm_living15_transf", main = "Squared metres of living15",
breaks = 30)
grid(nx = NA, ny = NULL, lty = 2, col = "gray", lwd = 1)
h <- hist((sqm_living15_transf),col = "#87CEFA", add = TRUE, breaks = 30)
We don’t apply the Box Cox Transformation.
# We transform the data for sqm_lot15 in order to obtain gaussianity.
hist(Houses_tr$sqm_lot15, main="Squared metres15", breaks = 30)
bx = boxcox(I(Houses_tr$sqm_lot15) + 1 ~ . - id - attribute_0, data = Houses_tr,
lambda = seq(-1, 1, length = 10)) # if we don't add 1, we would compute negative logarithms
lambda = bx$x[which.max(bx$y)]
sprintf("The value of lambda used is: %f", lambda)
## [1] "The value of lambda used is: 1.000000"
sqm_lot15_transf = (Houses_tr$sqm_lot15^lambda - 1)/lambda
h <- hist(sqm_lot15_transf + 1, xlab = "sqm_lot15_transf", main = "Squared metres of lot15",
breaks = 30)
grid(nx = NA, ny = NULL, lty = 2, col = "gray", lwd = 1)
h <- hist((sqm_lot15_transf + 1),col = "#87CEFA", add = TRUE, breaks = 30)
We do not apply any transformation.
# We download the processed data to work in python
write.csv(Houses_tr,"Processed_tr.csv", row.names = FALSE)
write.csv(Houses_te,"Processed_te.csv", row.names = FALSE)
# Correlation plot of numerical variables
Houses_smth <- subset(Houses_tr, select = - c(id, attribute_0))
ggcorr(Houses_smth, hjust = 0.85, size = 3, color = "black", type = "upper", layout.exp = 2,
label = TRUE, label_size = 2.5, low = "yellow", mid = "orange", high = "red")+
labs(title = "Correlation Heat-Map between variables") +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
## Warning in ggcorr(Houses_smth, hjust = 0.85, size = 3, color = "black", : data
## in column(s) 'floors', 'waterfront', 'view', 'condition' are not numeric and
## were ignored
## Warning: Ignoring unknown parameters: type